import os
import requests
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Kaggle download endpoint for the extended crab age prediction dataset.
# NOTE(review): Kaggle downloads normally require an authenticated session -
# confirm this URL can actually be fetched anonymously.
url = "https://www.kaggle.com/datasets/shalfey/extended-crab-age-prediction/download?datasetVersionNumber=1"
# Local path the downloaded file is saved to (and later read from with pandas)
file_path = "Crabs.csv"
def download_file(url, file_path):
    """Stream ``url`` to ``file_path`` and report the outcome on stdout.

    The payload is first written to a temporary ``<file_path>.part`` file and
    moved into place only once the transfer has finished, so an interrupted
    download never leaves a truncated file at ``file_path``.

    Args:
        url: Address to fetch with an HTTP GET.
        file_path: Destination path on disk.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = f"{file_path}.part"
    # Without a timeout, requests waits forever on a stalled server.
    # The context manager releases the connection even on early return.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        content_type = response.headers.get("Content-Type", "")
        # A 200 response carrying HTML is a login/error page, not the dataset.
        if response.status_code != 200 or "text/html" in content_type.lower():
            print("Failed to download file.")
            return
        try:
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)
        except Exception:
            # Do not leave a half-written temporary file behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    # Atomic on the same filesystem: readers see the old state or the full file.
    os.replace(partial_path, file_path)
    print("File downloaded successfully.")
def check_and_download_file(url, file_path):
    """Download ``url`` to ``file_path`` unless a file already exists there.

    Args:
        url: Address handed to ``download_file`` when a download is needed.
        file_path: Path checked for an existing regular file.
    """
    already_present = os.path.isfile(file_path)
    if already_present:
        print("File already exists.")
        return
    download_file(url, file_path)
# Fetch the dataset only when no local copy exists at file_path
check_and_download_file(url, file_path)
File already exists.
# Load the dataset
df = pd.read_csv(file_path)
# Zeros are treated as missing values. Restrict the replacement to the numeric
# measurement columns: applied to the whole frame it also blanks the row whose
# `id` happens to be 0 (dropping an otherwise valid crab) and turns `id` into
# floats.
measurement_cols = df.select_dtypes(include='number').columns.difference(['id'])
df[measurement_cols] = df[measurement_cols].replace(0, float('nan'))
# Count the missing values BEFORE dropping them; counted after dropna() the
# result is always zero and tells nothing about the data.
na_counts = df.isna().sum()
null_counts = df.isnull().sum()  # isnull() is an alias of isna()
print("NA counts:")
print(na_counts)
print("Null counts:")
print(null_counts)
# Discard every row that has at least one missing value
df.dropna(inplace=True)
NA counts: id 0 Sex 0 Length 0 Diameter 0 Height 0 Weight 0 Shucked Weight 0 Viscera Weight 0 Shell Weight 0 Age 0 dtype: int64 Null counts: id 0 Sex 0 Length 0 Diameter 0 Height 0 Weight 0 Shucked Weight 0 Viscera Weight 0 Shell Weight 0 Age 0 dtype: int64
Because many of the variables are collinear, I want to derive two calculated variables, Volume and Density.
# Volume: pi * (Diameter / 2)^2 * Height, i.e. the volume of a cylinder
df['Volume'] = df['Diameter'].div(2).pow(2).mul(np.pi).mul(df['Height'])
# Density: Weight per unit of the volume computed above
df['Density'] = df['Weight'].div(df['Volume'])
print(df.shape)
(199852, 12)
# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 199852 entries, 1 to 199999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 199852 non-null float64 1 Sex 199852 non-null object 2 Length 199852 non-null float64 3 Diameter 199852 non-null float64 4 Height 199852 non-null float64 5 Weight 199852 non-null float64 6 Shucked Weight 199852 non-null float64 7 Viscera Weight 199852 non-null float64 8 Shell Weight 199852 non-null float64 9 Age 199852 non-null float64 10 Volume 199852 non-null float64 11 Density 199852 non-null float64 dtypes: float64(11), object(1) memory usage: 19.8+ MB None
# Preview the first rows (the result is only displayed by an interactive/notebook session)
df.head()
| id | Sex | Length | Diameter | Height | Weight | Shucked Weight | Viscera Weight | Shell Weight | Age | Volume | Density | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1.0 | I | 1.2375 | 1.0000 | 0.3750 | 21.885814 | 7.654365 | 3.798833 | 7.654365 | 19.0 | 0.294524 | 74.309024 |
| 2 | 2.0 | F | 1.4500 | 1.1625 | 0.4125 | 28.250277 | 11.127179 | 7.016501 | 7.257472 | 11.0 | 0.437824 | 64.524248 |
| 3 | 3.0 | I | 1.3500 | 1.0250 | 0.3750 | 21.588144 | 9.738053 | 4.110678 | 6.378637 | 9.0 | 0.309435 | 69.766419 |
| 4 | 4.0 | I | 1.1375 | 0.8750 | 0.2875 | 14.968536 | 5.953395 | 2.962523 | 3.713785 | 8.0 | 0.172880 | 86.583570 |
| 5 | 5.0 | F | 1.4875 | 1.1875 | 0.4000 | 28.335325 | 12.048538 | 7.668540 | 8.504850 | 10.0 | 0.443014 | 63.960388 |
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
print(df.describe())
id Length Diameter Height \
count 199852.000000 199852.000000 199852.000000 199852.000000
mean 99996.314633 1.313000 1.020788 0.346273
std 57734.152658 0.289181 0.238135 0.090756
min 1.000000 0.187500 0.112500 0.012500
25% 49996.750000 1.150000 0.875000 0.287500
50% 99999.500000 1.375000 1.075000 0.362500
75% 149994.250000 1.525000 1.200000 0.412500
max 199999.000000 7.583491 2.250000 2.825000
Weight Shucked Weight Viscera Weight Shell Weight \
count 199852.000000 199852.000000 199852.000000 199852.000000
mean 23.138912 9.996106 4.996516 6.638642
std 12.589390 5.600855 2.783996 3.554264
min 0.028349 0.014175 0.014175 0.042524
25% 13.168343 5.669900 2.806601 3.827183
50% 23.530085 9.851451 4.890289 6.803880
75% 32.077459 13.933779 6.959802 9.029316
max 80.101512 45.274152 26.124064 37.038622
Age Volume Density
count 199852.000000 199852.000000 199852.000000
mean 9.954897 0.328318 73.311126
std 3.214314 0.186511 13.717132
min 1.000000 0.000249 4.242654
25% 8.000000 0.179841 66.508786
50% 10.000000 0.330064 71.818174
75% 11.000000 0.461725 77.949224
max 29.000000 1.898418 1654.163642
# Cut-off, in standard deviations from the mean Density
threshold = 3
z_scores = df['Density'].sub(df['Density'].mean()).div(df['Density'].std())
# Rows whose absolute z-score exceeds the cut-off
outliers = df[z_scores.abs() > threshold]
# Drop those rows and renumber the remaining ones from 0
df = df.drop(index=outliers.index).reset_index(drop=True)
# setup
variables = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Volume', 'Density']
ages = np.sort(df['Age'].unique())
grouped = df.groupby(['Sex', 'Age'])
# For every age, run a two-sample t-test (males vs. females) on each variable
for age in ages:
    print(f"\nAge: {age}")
    print("-------------------------")
    group = df[df['Age'] == age]
    # The sex split does not depend on the variable, so do it once per age
    males_at_age = group[group['Sex'] == 'M']
    females_at_age = group[group['Sex'] == 'F']
    for var in variables:
        group1 = males_at_age[var]
        group2 = females_at_age[var]
        # A t-test needs at least two observations on each side
        enough_data = len(group1) >= 2 and len(group2) >= 2
        if enough_data:
            t_statistic, p_value = stats.ttest_ind(group1, group2)
            print(f"Age={age}, Variable={var}: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
            if p_value < 0.05:
                print("Reject null")
        else:
            print(f"For Age={age}, Variable={var}: Insufficient data")
Age: 1.0 ------------------------- For Age=1.0, Variable=Length: Insufficient data For Age=1.0, Variable=Diameter: Insufficient data For Age=1.0, Variable=Height: Insufficient data For Age=1.0, Variable=Weight: Insufficient data For Age=1.0, Variable=Shucked Weight: Insufficient data For Age=1.0, Variable=Viscera Weight: Insufficient data For Age=1.0, Variable=Shell Weight: Insufficient data For Age=1.0, Variable=Volume: Insufficient data For Age=1.0, Variable=Density: Insufficient data Age: 2.0 ------------------------- For Age=2.0, Variable=Length: Insufficient data For Age=2.0, Variable=Diameter: Insufficient data For Age=2.0, Variable=Height: Insufficient data For Age=2.0, Variable=Weight: Insufficient data For Age=2.0, Variable=Shucked Weight: Insufficient data For Age=2.0, Variable=Viscera Weight: Insufficient data For Age=2.0, Variable=Shell Weight: Insufficient data For Age=2.0, Variable=Volume: Insufficient data For Age=2.0, Variable=Density: Insufficient data Age: 3.0 ------------------------- For Age=3.0, Variable=Length: Insufficient data For Age=3.0, Variable=Diameter: Insufficient data For Age=3.0, Variable=Height: Insufficient data For Age=3.0, Variable=Weight: Insufficient data For Age=3.0, Variable=Shucked Weight: Insufficient data For Age=3.0, Variable=Viscera Weight: Insufficient data For Age=3.0, Variable=Shell Weight: Insufficient data For Age=3.0, Variable=Volume: Insufficient data For Age=3.0, Variable=Density: Insufficient data Age: 4.0 ------------------------- Age=4.0, Variable=Length: t-statistic=0.022, p-value=0.983 Age=4.0, Variable=Diameter: t-statistic=0.440, p-value=0.661 Age=4.0, Variable=Height: t-statistic=0.949, p-value=0.346 Age=4.0, Variable=Weight: t-statistic=0.373, p-value=0.710 Age=4.0, Variable=Shucked Weight: t-statistic=0.514, p-value=0.609 Age=4.0, Variable=Viscera Weight: t-statistic=0.371, p-value=0.712 Age=4.0, Variable=Shell Weight: t-statistic=0.374, p-value=0.709 Age=4.0, Variable=Volume: t-statistic=0.631, 
p-value=0.530 Age=4.0, Variable=Density: t-statistic=-1.188, p-value=0.239 Age: 5.0 ------------------------- Age=5.0, Variable=Length: t-statistic=-2.918, p-value=0.004 Reject null Age=5.0, Variable=Diameter: t-statistic=-2.689, p-value=0.008 Reject null Age=5.0, Variable=Height: t-statistic=-1.951, p-value=0.052 Age=5.0, Variable=Weight: t-statistic=-2.999, p-value=0.003 Reject null Age=5.0, Variable=Shucked Weight: t-statistic=-1.903, p-value=0.058 Age=5.0, Variable=Viscera Weight: t-statistic=-3.201, p-value=0.002 Reject null Age=5.0, Variable=Shell Weight: t-statistic=-3.737, p-value=0.000 Reject null Age=5.0, Variable=Volume: t-statistic=-2.459, p-value=0.015 Reject null Age=5.0, Variable=Density: t-statistic=-1.446, p-value=0.149 Age: 6.0 ------------------------- Age=6.0, Variable=Length: t-statistic=-8.084, p-value=0.000 Reject null Age=6.0, Variable=Diameter: t-statistic=-8.110, p-value=0.000 Reject null Age=6.0, Variable=Height: t-statistic=-7.552, p-value=0.000 Reject null Age=6.0, Variable=Weight: t-statistic=-7.307, p-value=0.000 Reject null Age=6.0, Variable=Shucked Weight: t-statistic=-6.915, p-value=0.000 Reject null Age=6.0, Variable=Viscera Weight: t-statistic=-7.676, p-value=0.000 Reject null Age=6.0, Variable=Shell Weight: t-statistic=-7.328, p-value=0.000 Reject null Age=6.0, Variable=Volume: t-statistic=-7.531, p-value=0.000 Reject null Age=6.0, Variable=Density: t-statistic=1.024, p-value=0.306 Age: 7.0 ------------------------- Age=7.0, Variable=Length: t-statistic=-10.670, p-value=0.000 Reject null Age=7.0, Variable=Diameter: t-statistic=-10.882, p-value=0.000 Reject null Age=7.0, Variable=Height: t-statistic=-10.272, p-value=0.000 Reject null Age=7.0, Variable=Weight: t-statistic=-9.683, p-value=0.000 Reject null Age=7.0, Variable=Shucked Weight: t-statistic=-8.757, p-value=0.000 Reject null Age=7.0, Variable=Viscera Weight: t-statistic=-10.465, p-value=0.000 Reject null Age=7.0, Variable=Shell Weight: t-statistic=-9.789, p-value=0.000 
Reject null Age=7.0, Variable=Volume: t-statistic=-9.693, p-value=0.000 Reject null Age=7.0, Variable=Density: t-statistic=2.349, p-value=0.019 Reject null Age: 8.0 ------------------------- Age=8.0, Variable=Length: t-statistic=-8.738, p-value=0.000 Reject null Age=8.0, Variable=Diameter: t-statistic=-9.103, p-value=0.000 Reject null Age=8.0, Variable=Height: t-statistic=-9.177, p-value=0.000 Reject null Age=8.0, Variable=Weight: t-statistic=-7.967, p-value=0.000 Reject null Age=8.0, Variable=Shucked Weight: t-statistic=-6.716, p-value=0.000 Reject null Age=8.0, Variable=Viscera Weight: t-statistic=-8.722, p-value=0.000 Reject null Age=8.0, Variable=Shell Weight: t-statistic=-9.235, p-value=0.000 Reject null Age=8.0, Variable=Volume: t-statistic=-8.935, p-value=0.000 Reject null Age=8.0, Variable=Density: t-statistic=4.722, p-value=0.000 Reject null Age: 9.0 ------------------------- Age=9.0, Variable=Length: t-statistic=-13.077, p-value=0.000 Reject null Age=9.0, Variable=Diameter: t-statistic=-13.152, p-value=0.000 Reject null Age=9.0, Variable=Height: t-statistic=-13.283, p-value=0.000 Reject null Age=9.0, Variable=Weight: t-statistic=-12.271, p-value=0.000 Reject null Age=9.0, Variable=Shucked Weight: t-statistic=-10.398, p-value=0.000 Reject null Age=9.0, Variable=Viscera Weight: t-statistic=-13.269, p-value=0.000 Reject null Age=9.0, Variable=Shell Weight: t-statistic=-13.800, p-value=0.000 Reject null Age=9.0, Variable=Volume: t-statistic=-13.072, p-value=0.000 Reject null Age=9.0, Variable=Density: t-statistic=4.224, p-value=0.000 Reject null Age: 10.0 ------------------------- Age=10.0, Variable=Length: t-statistic=-8.306, p-value=0.000 Reject null Age=10.0, Variable=Diameter: t-statistic=-8.575, p-value=0.000 Reject null Age=10.0, Variable=Height: t-statistic=-10.274, p-value=0.000 Reject null Age=10.0, Variable=Weight: t-statistic=-8.655, p-value=0.000 Reject null Age=10.0, Variable=Shucked Weight: t-statistic=-5.889, p-value=0.000 Reject null Age=10.0, 
Variable=Viscera Weight: t-statistic=-10.045, p-value=0.000 Reject null Age=10.0, Variable=Shell Weight: t-statistic=-9.865, p-value=0.000 Reject null Age=10.0, Variable=Volume: t-statistic=-10.343, p-value=0.000 Reject null Age=10.0, Variable=Density: t-statistic=6.362, p-value=0.000 Reject null Age: 11.0 ------------------------- Age=11.0, Variable=Length: t-statistic=-7.473, p-value=0.000 Reject null Age=11.0, Variable=Diameter: t-statistic=-7.284, p-value=0.000 Reject null Age=11.0, Variable=Height: t-statistic=-7.636, p-value=0.000 Reject null Age=11.0, Variable=Weight: t-statistic=-7.349, p-value=0.000 Reject null Age=11.0, Variable=Shucked Weight: t-statistic=-5.065, p-value=0.000 Reject null Age=11.0, Variable=Viscera Weight: t-statistic=-8.440, p-value=0.000 Reject null Age=11.0, Variable=Shell Weight: t-statistic=-8.430, p-value=0.000 Reject null Age=11.0, Variable=Volume: t-statistic=-8.163, p-value=0.000 Reject null Age=11.0, Variable=Density: t-statistic=2.824, p-value=0.005 Reject null Age: 12.0 ------------------------- Age=12.0, Variable=Length: t-statistic=-0.905, p-value=0.365 Age=12.0, Variable=Diameter: t-statistic=-1.032, p-value=0.302 Age=12.0, Variable=Height: t-statistic=-1.565, p-value=0.118 Age=12.0, Variable=Weight: t-statistic=-0.808, p-value=0.419 Age=12.0, Variable=Shucked Weight: t-statistic=0.672, p-value=0.501 Age=12.0, Variable=Viscera Weight: t-statistic=-0.803, p-value=0.422 Age=12.0, Variable=Shell Weight: t-statistic=-1.882, p-value=0.060 Age=12.0, Variable=Volume: t-statistic=-1.322, p-value=0.186 Age=12.0, Variable=Density: t-statistic=1.347, p-value=0.178 Age: 13.0 ------------------------- Age=13.0, Variable=Length: t-statistic=0.930, p-value=0.352 Age=13.0, Variable=Diameter: t-statistic=0.697, p-value=0.486 Age=13.0, Variable=Height: t-statistic=-0.753, p-value=0.452 Age=13.0, Variable=Weight: t-statistic=0.904, p-value=0.366 Age=13.0, Variable=Shucked Weight: t-statistic=2.153, p-value=0.031 Reject null Age=13.0, 
Variable=Viscera Weight: t-statistic=-1.045, p-value=0.296 Age=13.0, Variable=Shell Weight: t-statistic=0.445, p-value=0.656 Age=13.0, Variable=Volume: t-statistic=-0.188, p-value=0.851 Age=13.0, Variable=Density: t-statistic=2.691, p-value=0.007 Reject null Age: 14.0 ------------------------- Age=14.0, Variable=Length: t-statistic=-3.521, p-value=0.000 Reject null Age=14.0, Variable=Diameter: t-statistic=-3.379, p-value=0.001 Reject null Age=14.0, Variable=Height: t-statistic=-3.243, p-value=0.001 Reject null Age=14.0, Variable=Weight: t-statistic=-2.913, p-value=0.004 Reject null Age=14.0, Variable=Shucked Weight: t-statistic=-1.508, p-value=0.132 Age=14.0, Variable=Viscera Weight: t-statistic=-4.498, p-value=0.000 Reject null Age=14.0, Variable=Shell Weight: t-statistic=-3.345, p-value=0.001 Reject null Age=14.0, Variable=Volume: t-statistic=-3.674, p-value=0.000 Reject null Age=14.0, Variable=Density: t-statistic=2.355, p-value=0.019 Reject null Age: 15.0 ------------------------- Age=15.0, Variable=Length: t-statistic=0.779, p-value=0.436 Age=15.0, Variable=Diameter: t-statistic=1.064, p-value=0.288 Age=15.0, Variable=Height: t-statistic=0.800, p-value=0.423 Age=15.0, Variable=Weight: t-statistic=1.493, p-value=0.136 Age=15.0, Variable=Shucked Weight: t-statistic=1.995, p-value=0.046 Reject null Age=15.0, Variable=Viscera Weight: t-statistic=0.320, p-value=0.749 Age=15.0, Variable=Shell Weight: t-statistic=1.132, p-value=0.258 Age=15.0, Variable=Volume: t-statistic=0.811, p-value=0.418 Age=15.0, Variable=Density: t-statistic=1.631, p-value=0.103 Age: 16.0 ------------------------- Age=16.0, Variable=Length: t-statistic=-2.408, p-value=0.016 Reject null Age=16.0, Variable=Diameter: t-statistic=-2.184, p-value=0.029 Reject null Age=16.0, Variable=Height: t-statistic=-2.972, p-value=0.003 Reject null Age=16.0, Variable=Weight: t-statistic=-2.263, p-value=0.024 Reject null Age=16.0, Variable=Shucked Weight: t-statistic=-1.673, p-value=0.094 Age=16.0, 
Variable=Viscera Weight: t-statistic=-2.712, p-value=0.007 Reject null Age=16.0, Variable=Shell Weight: t-statistic=-1.955, p-value=0.051 Age=16.0, Variable=Volume: t-statistic=-2.759, p-value=0.006 Reject null Age=16.0, Variable=Density: t-statistic=2.423, p-value=0.015 Reject null Age: 17.0 ------------------------- Age=17.0, Variable=Length: t-statistic=-3.403, p-value=0.001 Reject null Age=17.0, Variable=Diameter: t-statistic=-3.311, p-value=0.001 Reject null Age=17.0, Variable=Height: t-statistic=-2.733, p-value=0.006 Reject null Age=17.0, Variable=Weight: t-statistic=-2.940, p-value=0.003 Reject null Age=17.0, Variable=Shucked Weight: t-statistic=-2.365, p-value=0.018 Reject null Age=17.0, Variable=Viscera Weight: t-statistic=-2.782, p-value=0.005 Reject null Age=17.0, Variable=Shell Weight: t-statistic=-2.794, p-value=0.005 Reject null Age=17.0, Variable=Volume: t-statistic=-3.327, p-value=0.001 Reject null Age=17.0, Variable=Density: t-statistic=0.996, p-value=0.319 Age: 18.0 ------------------------- Age=18.0, Variable=Length: t-statistic=-2.309, p-value=0.021 Reject null Age=18.0, Variable=Diameter: t-statistic=-2.286, p-value=0.022 Reject null Age=18.0, Variable=Height: t-statistic=-1.769, p-value=0.077 Age=18.0, Variable=Weight: t-statistic=-1.861, p-value=0.063 Age=18.0, Variable=Shucked Weight: t-statistic=-0.241, p-value=0.810 Age=18.0, Variable=Viscera Weight: t-statistic=-2.109, p-value=0.035 Reject null Age=18.0, Variable=Shell Weight: t-statistic=-2.654, p-value=0.008 Reject null Age=18.0, Variable=Volume: t-statistic=-2.062, p-value=0.039 Reject null Age=18.0, Variable=Density: t-statistic=0.321, p-value=0.749 Age: 19.0 ------------------------- Age=19.0, Variable=Length: t-statistic=-1.122, p-value=0.262 Age=19.0, Variable=Diameter: t-statistic=-1.342, p-value=0.180 Age=19.0, Variable=Height: t-statistic=-1.082, p-value=0.280 Age=19.0, Variable=Weight: t-statistic=-1.062, p-value=0.288 Age=19.0, Variable=Shucked Weight: t-statistic=-0.328, 
p-value=0.743 Age=19.0, Variable=Viscera Weight: t-statistic=-1.273, p-value=0.203 Age=19.0, Variable=Shell Weight: t-statistic=-1.101, p-value=0.271 Age=19.0, Variable=Volume: t-statistic=-1.326, p-value=0.185 Age=19.0, Variable=Density: t-statistic=0.738, p-value=0.461 Age: 20.0 ------------------------- Age=20.0, Variable=Length: t-statistic=-1.611, p-value=0.108 Age=20.0, Variable=Diameter: t-statistic=-1.846, p-value=0.065 Age=20.0, Variable=Height: t-statistic=-1.007, p-value=0.314 Age=20.0, Variable=Weight: t-statistic=-1.079, p-value=0.281 Age=20.0, Variable=Shucked Weight: t-statistic=-0.272, p-value=0.786 Age=20.0, Variable=Viscera Weight: t-statistic=-0.581, p-value=0.561 Age=20.0, Variable=Shell Weight: t-statistic=-1.329, p-value=0.184 Age=20.0, Variable=Volume: t-statistic=-1.490, p-value=0.136 Age=20.0, Variable=Density: t-statistic=1.201, p-value=0.230 Age: 21.0 ------------------------- Age=21.0, Variable=Length: t-statistic=-1.115, p-value=0.265 Age=21.0, Variable=Diameter: t-statistic=-1.146, p-value=0.252 Age=21.0, Variable=Height: t-statistic=-0.273, p-value=0.785 Age=21.0, Variable=Weight: t-statistic=-0.801, p-value=0.423 Age=21.0, Variable=Shucked Weight: t-statistic=-1.085, p-value=0.278 Age=21.0, Variable=Viscera Weight: t-statistic=-1.410, p-value=0.159 Age=21.0, Variable=Shell Weight: t-statistic=-0.282, p-value=0.778 Age=21.0, Variable=Volume: t-statistic=-0.729, p-value=0.466 Age=21.0, Variable=Density: t-statistic=-0.324, p-value=0.746 Age: 22.0 ------------------------- Age=22.0, Variable=Length: t-statistic=-0.310, p-value=0.757 Age=22.0, Variable=Diameter: t-statistic=-0.093, p-value=0.926 Age=22.0, Variable=Height: t-statistic=-0.112, p-value=0.911 Age=22.0, Variable=Weight: t-statistic=-0.128, p-value=0.898 Age=22.0, Variable=Shucked Weight: t-statistic=0.579, p-value=0.563 Age=22.0, Variable=Viscera Weight: t-statistic=-0.131, p-value=0.896 Age=22.0, Variable=Shell Weight: t-statistic=0.263, p-value=0.793 Age=22.0, 
Variable=Volume: t-statistic=0.006, p-value=0.995 Age=22.0, Variable=Density: t-statistic=0.001, p-value=0.999 Age: 23.0 ------------------------- Age=23.0, Variable=Length: t-statistic=-0.346, p-value=0.730 Age=23.0, Variable=Diameter: t-statistic=-0.030, p-value=0.976 Age=23.0, Variable=Height: t-statistic=-0.674, p-value=0.501 Age=23.0, Variable=Weight: t-statistic=-0.473, p-value=0.637 Age=23.0, Variable=Shucked Weight: t-statistic=0.194, p-value=0.846 Age=23.0, Variable=Viscera Weight: t-statistic=-0.998, p-value=0.319 Age=23.0, Variable=Shell Weight: t-statistic=-1.299, p-value=0.195 Age=23.0, Variable=Volume: t-statistic=-0.325, p-value=0.746 Age=23.0, Variable=Density: t-statistic=-0.059, p-value=0.953 Age: 24.0 ------------------------- Age=24.0, Variable=Length: t-statistic=-0.737, p-value=0.463 Age=24.0, Variable=Diameter: t-statistic=-0.562, p-value=0.575 Age=24.0, Variable=Height: t-statistic=-1.306, p-value=0.195 Age=24.0, Variable=Weight: t-statistic=-0.194, p-value=0.847 Age=24.0, Variable=Shucked Weight: t-statistic=-0.230, p-value=0.818 Age=24.0, Variable=Viscera Weight: t-statistic=-1.029, p-value=0.306 Age=24.0, Variable=Shell Weight: t-statistic=-0.001, p-value=0.999 Age=24.0, Variable=Volume: t-statistic=-0.948, p-value=0.345 Age=24.0, Variable=Density: t-statistic=1.902, p-value=0.060 Age: 25.0 ------------------------- Age=25.0, Variable=Length: t-statistic=-0.621, p-value=0.538 Age=25.0, Variable=Diameter: t-statistic=-0.424, p-value=0.673 Age=25.0, Variable=Height: t-statistic=-1.119, p-value=0.269 Age=25.0, Variable=Weight: t-statistic=-0.623, p-value=0.537 Age=25.0, Variable=Shucked Weight: t-statistic=-1.083, p-value=0.284 Age=25.0, Variable=Viscera Weight: t-statistic=0.105, p-value=0.917 Age=25.0, Variable=Shell Weight: t-statistic=-0.198, p-value=0.844 Age=25.0, Variable=Volume: t-statistic=-0.676, p-value=0.503 Age=25.0, Variable=Density: t-statistic=-0.093, p-value=0.927 Age: 26.0 ------------------------- Age=26.0, 
Variable=Length: t-statistic=-2.342, p-value=0.024 Reject null Age=26.0, Variable=Diameter: t-statistic=-2.610, p-value=0.013 Reject null Age=26.0, Variable=Height: t-statistic=-1.976, p-value=0.055 Age=26.0, Variable=Weight: t-statistic=-2.815, p-value=0.008 Reject null Age=26.0, Variable=Shucked Weight: t-statistic=-2.998, p-value=0.005 Reject null Age=26.0, Variable=Viscera Weight: t-statistic=-4.099, p-value=0.000 Reject null Age=26.0, Variable=Shell Weight: t-statistic=-2.197, p-value=0.034 Reject null Age=26.0, Variable=Volume: t-statistic=-2.438, p-value=0.019 Reject null Age=26.0, Variable=Density: t-statistic=-0.401, p-value=0.691 Age: 27.0 ------------------------- Age=27.0, Variable=Length: t-statistic=-0.426, p-value=0.671 Age=27.0, Variable=Diameter: t-statistic=-0.706, p-value=0.482 Age=27.0, Variable=Height: t-statistic=0.947, p-value=0.346 Age=27.0, Variable=Weight: t-statistic=-0.202, p-value=0.840 Age=27.0, Variable=Shucked Weight: t-statistic=-0.052, p-value=0.958 Age=27.0, Variable=Viscera Weight: t-statistic=-1.034, p-value=0.304 Age=27.0, Variable=Shell Weight: t-statistic=0.119, p-value=0.906 Age=27.0, Variable=Volume: t-statistic=-0.071, p-value=0.943 Age=27.0, Variable=Density: t-statistic=-0.035, p-value=0.972 Age: 29.0 ------------------------- Age=29.0, Variable=Length: t-statistic=0.132, p-value=0.895 Age=29.0, Variable=Diameter: t-statistic=-0.664, p-value=0.510 Age=29.0, Variable=Height: t-statistic=-0.499, p-value=0.620 Age=29.0, Variable=Weight: t-statistic=-0.533, p-value=0.596 Age=29.0, Variable=Shucked Weight: t-statistic=0.776, p-value=0.441 Age=29.0, Variable=Viscera Weight: t-statistic=-0.734, p-value=0.466 Age=29.0, Variable=Shell Weight: t-statistic=-0.719, p-value=0.475 Age=29.0, Variable=Volume: t-statistic=-0.665, p-value=0.509 Age=29.0, Variable=Density: t-statistic=0.240, p-value=0.811
# Correlation heatmap of the numeric features (the `id` column is excluded)
df_1 = df.drop('id', axis=1)
# `Sex` holds strings. pandas >= 2.0 no longer skips non-numeric columns in
# corr() and raises instead, so select the numeric columns explicitly (this
# also works on older pandas versions).
correlation_matrix = df_1.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.show()
Because we know that there are differences in size at some ages, when we analyze we want to be sure we are not over-representing any age in our data.
# Setup for sampling
# Drop the rows whose Sex is 'I'. The mask is built from df_1 itself (not from
# df) so it can never be misaligned with the frame it filters.
df_filtered = df_1[~df_1['Sex'].isin(['I'])]
ages = list(range(4, 28))
sample_size = 10000
# Calculate the sample size for each age group and sex
sample_size_per_age_sex = sample_size // (len(ages) * 2)
# One seeded generator shared by all draws, so the balanced sample (and every
# statistic computed from it) is reproducible between runs
sampling_rng = np.random.RandomState(42)
age_sex_samples = []
# Draw the same number of rows (with replacement) from every age/sex cell
for age in ages:
    for sex in ("M", "F"):
        age_sex_group = df_filtered[(df_filtered['Age'] == age) & (df_filtered['Sex'] == sex)]
        if age_sex_group.empty:
            # sample(n > 0) raises on an empty frame; report and move on
            print(f"No rows for Age={age}, Sex={sex}; skipped.")
            continue
        age_sex_samples.append(
            age_sex_group.sample(n=sample_size_per_age_sex, replace=True, random_state=sampling_rng)
        )
# Concatenate once instead of growing a DataFrame inside the loop
# (avoids quadratic copying and concatenating onto an empty frame)
df_sample = pd.concat(age_sex_samples) if age_sex_samples else df_filtered.iloc[0:0]
# Reset the index
df_sample = df_sample.reset_index(drop=True)
# Rows of the sample with 5 <= Age <= 11 (both bounds inclusive)
df_age_5_11 = df_sample.loc[(df_sample['Age'] >= 5) & (df_sample['Age'] <= 11)]
# print(df_age_5_11.describe())
# Rows of the sample with Age strictly above 18
df_age_gt_18 = df_sample.loc[df_sample['Age'].gt(18)]
# print(df_age_gt_18.describe())
# Side-by-side age densities by sex: the filtered data (left) and the
# age/sex-balanced sample (right)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns
panels = [(df_filtered, 'filtered data'), (df_sample, 'balanced sample')]
for ax, (d, description) in zip(axes, panels):
    # Fix the hue order so both panels use the same colour for each sex
    sns.kdeplot(data=d, x='Age', hue='Sex', hue_order=['M', 'F'], ax=ax)
    # Distinct titles, otherwise the two panels cannot be told apart
    ax.set_title(f'Age Distribution Density by Sex ({description})')
    ax.set_xlabel('Age')
    ax.set_ylabel('Density')  # a KDE shows a density, not a frequency
    # Relabel the legend seaborn already built. Calling ax.legend(labels)
    # pairs the labels with the axes' artists purely by draw order, which is
    # not guaranteed to match the order of d['Sex'].unique().
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Sex')
        for entry in legend.get_texts():
            entry.set_text(f'Sex: {entry.get_text()}')
plt.tight_layout()  # Adjust spacing between subplots
plt.show()
C:\Users\darkc\anaconda3\lib\site-packages\seaborn\distributions.py:316: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning. warnings.warn(msg, UserWarning)
# Male vs. female two-sample t-tests on the Age > 18 subset, one per variable
print("Age > 18")
print("-------------------------")
older_males = df_age_gt_18[df_age_gt_18['Sex'] == 'M']
older_females = df_age_gt_18[df_age_gt_18['Sex'] == 'F']
for var in variables:
    group1 = older_males[var]
    group2 = older_females[var]
    t_statistic, p_value = stats.ttest_ind(group1, group2)
    print(f"Variable: {var}")
    print(f"For Males and Females: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
    significant = p_value < 0.05
    if significant:
        print("Reject null")
    print("-------------------------")
Age > 18 ------------------------- Variable: Length For Males and Females: t-statistic=-4.702, p-value=0.000 Reject null ------------------------- Variable: Diameter For Males and Females: t-statistic=-4.479, p-value=0.000 Reject null ------------------------- Variable: Height For Males and Females: t-statistic=-4.612, p-value=0.000 Reject null ------------------------- Variable: Weight For Males and Females: t-statistic=-4.100, p-value=0.000 Reject null ------------------------- Variable: Shucked Weight For Males and Females: t-statistic=-2.344, p-value=0.019 Reject null ------------------------- Variable: Viscera Weight For Males and Females: t-statistic=-5.526, p-value=0.000 Reject null ------------------------- Variable: Shell Weight For Males and Females: t-statistic=-4.056, p-value=0.000 Reject null ------------------------- Variable: Volume For Males and Females: t-statistic=-4.800, p-value=0.000 Reject null ------------------------- Variable: Density For Males and Females: t-statistic=1.877, p-value=0.061 -------------------------
# Male vs. female two-sample t-tests on the Age 5-11 subset, one per variable
print("Age 5-11")
print("-------------------------")
is_male = df_age_5_11['Sex'].eq('M')
is_female = df_age_5_11['Sex'].eq('F')
for var in variables:
    group1 = df_age_5_11.loc[is_male, var]
    group2 = df_age_5_11.loc[is_female, var]
    t_statistic, p_value = stats.ttest_ind(group1, group2)
    print(f"Variable: {var}")
    print(f"For Males and Females: t-statistic={t_statistic:.3f}, p-value={p_value:.3f}")
    if not p_value >= 0.05:
        print("Reject null")
    print("-------------------------")
Age 5-11 ------------------------- Variable: Length For Males and Females: t-statistic=-4.601, p-value=0.000 Reject null ------------------------- Variable: Diameter For Males and Females: t-statistic=-4.556, p-value=0.000 Reject null ------------------------- Variable: Height For Males and Females: t-statistic=-4.492, p-value=0.000 Reject null ------------------------- Variable: Weight For Males and Females: t-statistic=-3.614, p-value=0.000 Reject null ------------------------- Variable: Shucked Weight For Males and Females: t-statistic=-3.447, p-value=0.001 Reject null ------------------------- Variable: Viscera Weight For Males and Females: t-statistic=-4.102, p-value=0.000 Reject null ------------------------- Variable: Shell Weight For Males and Females: t-statistic=-3.733, p-value=0.000 Reject null ------------------------- Variable: Volume For Males and Females: t-statistic=-3.817, p-value=0.000 Reject null ------------------------- Variable: Density For Males and Females: t-statistic=-0.407, p-value=0.684 -------------------------
I tested my two subsets expecting one to be significant and the other not to be. They are both significant. I assume this is because of unequal sample sizes across ages; this would invalidate the significance testing if age is not controlled for.
I re-did the analysis after normalizing the sample across ages and found that Density and Shucked Weight are the most similar between the sexes in the old crab group.
# Pairwise scatter matrices, one figure per age band (5-11 first, then > 18),
# coloured by sex
pair_grids = []
for age_band in (df_age_5_11, df_age_gt_18):
    grid = sns.PairGrid(age_band, hue='Sex', palette=['blue', 'orange'], hue_order=['M', 'F'])
    # Scatter plots above and below the diagonal, histograms on the diagonal
    grid.map_lower(sns.scatterplot)
    grid.map_upper(sns.scatterplot)
    # (alternative for the lower triangle: grid.map_lower(sns.kdeplot))
    grid.map_diag(sns.histplot)
    grid.add_legend(title='Sex')
    plt.tight_layout()
    plt.show()
    pair_grids.append(grid)
# Keep the original names available for later cells
g1, g2 = pair_grids
# Box plot comparing the distribution of Volume (not Age) for male and female
# crabs in the Age 5-11 subset
# Explicit category order so M and F sit in the same position in every plot
sns.boxplot(x='Sex', y='Volume', data=df_age_5_11, order=['M', 'F'])
# Set plot labels
plt.xlabel('Sex')
plt.ylabel('Volume')
# Name the age band in the title so this figure can be told apart from the
# otherwise identical Age > 18 box plot
plt.title('Distribution of Volume by Sex (Age 5-11)')
# Display the plot
plt.show()
# Box plot comparing the distribution of Volume (not Age) for male and female
# crabs in the Age > 18 subset
# Explicit category order so M and F sit in the same position in every plot
sns.boxplot(x='Sex', y='Volume', data=df_age_gt_18, order=['M', 'F'])
# Set plot labels
plt.xlabel('Sex')
plt.ylabel('Volume')
# Name the age band in the title so this figure can be told apart from the
# otherwise identical Age 5-11 box plot
plt.title('Distribution of Volume by Sex (Age > 18)')
# Display the plot
plt.show()
Hard to spot differences in the sexes. Will need to explore by age in order to really dig into it.
# One subplot per variable: mean by age for each sex, with the interquartile
# range (25th-75th percentile) drawn as a shaded band
grouped_data = df_sample.groupby('Sex')
fig, axes = plt.subplots(nrows=len(variables), ncols=1, figsize=(8, 16))
plt.subplots_adjust(hspace=0.7)
# Colour looked up by sex label, so it no longer depends on the order in which
# groupby happens to yield the groups (unknown labels fall back to the default
# matplotlib colour cycle)
sex_colors = {'F': 'pink', 'M': 'blue'}
for ax, variable in zip(np.atleast_1d(axes), variables):  # one subplot per variable
    for sex, group in grouped_data:  # one line per sex
        by_age = group.groupby('Age')[variable]  # group once, reuse three times
        mean = by_age.mean()
        # Quartiles, i.e. the interquartile range - not a confidence interval
        lower_quartile = by_age.quantile(0.25)
        upper_quartile = by_age.quantile(0.75)
        color = sex_colors.get(sex)
        ax.plot(mean.index, mean, label=sex, color=color)  # mean
        ax.fill_between(mean.index, lower_quartile, upper_quartile, alpha=0.3, color=color)  # IQR band
    ax.set_title(variable)  # title and labels for each variable subplot
    ax.set_xlabel('Age')
    ax.set_ylabel(variable)
    ax.legend()
plt.tight_layout()  # Adjust the figure layout and spacing
plt.show()  # Display the plot
# Split the Age 5-11 subset by sex; copies, because a column is added below
males = df_age_5_11.loc[df_age_5_11['Sex'].eq('M')].copy()
females = df_age_5_11.loc[df_age_5_11['Sex'].eq('F')].copy()
# Age with uniform noise in [-0.4, 0.4); the regressions below use the
# un-jittered Age column
males['Age_jittered'] = males['Age'] + np.random.uniform(-0.4, 0.4, size=len(males))
females['Age_jittered'] = females['Age'] + np.random.uniform(-0.4, 0.4, size=len(females))
# Ordinary least squares, Weight ~ const + Age, fitted separately per sex
y_males = males['Weight']
X_males = sm.add_constant(males[['Age']])
model_males = sm.OLS(y_males, X_males).fit()
y_females = females['Weight']
X_females = sm.add_constant(females[['Age']])
model_females = sm.OLS(y_females, X_females).fit()
# Model summaries
for heading, fitted_model in (
    ("Male Regression Summary:", model_males),
    ("\nFemale Regression Summary:", model_females),
):
    print(heading)
    print(fitted_model.summary())
Male Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.675
Model: OLS Adj. R-squared: 0.675
Method: Least Squares F-statistic: 3021.
Date: Wed, 28 Jun 2023 Prob (F-statistic): 0.00
Time: 17:05:40 Log-Likelihood: -4933.7
No. Observations: 1456 AIC: 9871.
Df Residuals: 1454 BIC: 9882.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -22.5299 0.775 -29.070 0.000 -24.050 -21.010
Age 5.1657 0.094 54.963 0.000 4.981 5.350
==============================================================================
Omnibus: 119.933 Durbin-Watson: 1.939
Prob(Omnibus): 0.000 Jarque-Bera (JB): 304.625
Skew: 0.461 Prob(JB): 7.10e-67
Kurtosis: 5.042 Cond. No. 34.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Female Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.660
Model: OLS Adj. R-squared: 0.660
Method: Least Squares F-statistic: 2820.
Date: Wed, 28 Jun 2023 Prob (F-statistic): 0.00
Time: 17:05:40 Log-Likelihood: -4923.0
No. Observations: 1456 AIC: 9850.
Df Residuals: 1454 BIC: 9861.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -19.1803 0.769 -24.930 0.000 -20.690 -17.671
Age 4.9545 0.093 53.104 0.000 4.771 5.138
==============================================================================
Omnibus: 73.362 Durbin-Watson: 1.887
Prob(Omnibus): 0.000 Jarque-Bera (JB): 148.565
Skew: 0.337 Prob(JB): 5.49e-33
Kurtosis: 4.412 Cond. No. 34.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Scatter plot of Weight against jittered Age for the df_age_5_11 subset,
# overlaid with the fitted OLS line of each sex.
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7)

# Read the coefficients by label ('const' is added by sm.add_constant, 'Age' is
# the regressor column). Integer positions on a label-indexed Series, as in
# params[0] / params[1], are deprecated in pandas.
male_slope, male_intercept = model_males.params['Age'], model_males.params['const']
female_slope, female_intercept = model_females.params['Age'], model_females.params['const']

# Render the intercept with an explicit sign so that a negative value reads
# "... * Age - 22.53" instead of "... * Age + -22.53".
male_label = f"Males: {male_slope:.2f} * Age {'+' if male_intercept >= 0 else '-'} {abs(male_intercept):.2f}"
female_label = f"Females: {female_slope:.2f} * Age {'+' if female_intercept >= 0 else '-'} {abs(female_intercept):.2f}"

sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=male_label)
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=female_label)

# The plotted frames come from df_age_5_11, so the title names that age band
# (it previously said "Age 12-17").
plt.title('Regression Analysis (Age 5-11 vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()
# --- Weight ~ Age regression for ages 12-18, fitted separately for each sex --
# Keep the rows of the sample whose Age lies in [12, 18], both ends inclusive.
df_age_12_18 = df_sample[df_sample['Age'].between(12, 18)]

# .copy() gives independent frames, so adding a column below does not write
# into a slice of df_age_12_18.
males = df_age_12_18.loc[df_age_12_18['Sex'] == 'M'].copy()
females = df_age_12_18.loc[df_age_12_18['Sex'] == 'F'].copy()

# Ages are spread horizontally by up to +/-0.4 so that overlapping points stay
# visible in the scatter plot. The regressions below use the un-jittered 'Age'.
# The draws are unseeded, so the jitter differs from run to run.
for sex_frame in (males, females):
    sex_frame['Age_jittered'] = sex_frame['Age'] + np.random.uniform(-0.4, 0.4, len(sex_frame))

# Ordinary least squares with an intercept: Weight = const + slope * Age.
y_males = males['Weight']
X_males = sm.add_constant(males[['Age']])
model_males = sm.OLS(y_males, X_males).fit()

y_females = females['Weight']
X_females = sm.add_constant(females[['Age']])
model_females = sm.OLS(y_females, X_females).fit()

# Print the full statsmodels report for each fit.
for heading, fitted in (
    ("Male Regression Summary:", model_males),
    ("\nFemale Regression Summary:", model_females),
):
    print(heading)
    print(fitted.summary())
Male Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.014
Model: OLS Adj. R-squared: 0.014
Method: Least Squares F-statistic: 21.26
Date: Wed, 28 Jun 2023 Prob (F-statistic): 4.36e-06
Time: 17:05:51 Log-Likelihood: -5336.6
No. Observations: 1456 AIC: 1.068e+04
Df Residuals: 1454 BIC: 1.069e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 23.6153 1.876 12.590 0.000 19.936 27.295
Age 0.5715 0.124 4.611 0.000 0.328 0.815
==============================================================================
Omnibus: 40.345 Durbin-Watson: 1.901
Prob(Omnibus): 0.000 Jarque-Bera (JB): 49.456
Skew: 0.333 Prob(JB): 1.82e-11
Kurtosis: 3.609 Cond. No. 115.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Female Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.013
Model: OLS Adj. R-squared: 0.013
Method: Least Squares F-statistic: 19.58
Date: Wed, 28 Jun 2023 Prob (F-statistic): 1.04e-05
Time: 17:05:51 Log-Likelihood: -5439.1
No. Observations: 1456 AIC: 1.088e+04
Df Residuals: 1454 BIC: 1.089e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 23.7868 2.013 11.819 0.000 19.839 27.735
Age 0.5885 0.133 4.425 0.000 0.328 0.849
==============================================================================
Omnibus: 60.449 Durbin-Watson: 2.023
Prob(Omnibus): 0.000 Jarque-Bera (JB): 77.117
Skew: 0.427 Prob(JB): 1.80e-17
Kurtosis: 3.736 Cond. No. 115.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Scatter plot of Weight against jittered Age for the 12-18 age band,
# overlaid with the fitted OLS line of each sex.
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7)

# Read the coefficients by label ('const' is added by sm.add_constant, 'Age' is
# the regressor column). Integer positions on a label-indexed Series, as in
# params[0] / params[1], are deprecated in pandas.
male_slope, male_intercept = model_males.params['Age'], model_males.params['const']
female_slope, female_intercept = model_females.params['Age'], model_females.params['const']

# Render the intercept with an explicit sign so that a negative value would
# read "... * Age - 1.23" instead of "... * Age + -1.23".
male_label = f"Males: {male_slope:.2f} * Age {'+' if male_intercept >= 0 else '-'} {abs(male_intercept):.2f}"
female_label = f"Females: {female_slope:.2f} * Age {'+' if female_intercept >= 0 else '-'} {abs(female_intercept):.2f}"

sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=male_label)
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=female_label)

plt.title('Regression Analysis (Age 12-18 vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()
# --- Weight ~ Age regression for the oldest band, fitted for each sex --------
# Drop the rows with Age == 25 from df_age_gt_18 (the frame itself is built
# earlier in the notebook).
# NOTE(review): the reason for excluding age 25 is not stated here -- confirm.
df_age_gt_18 = df_age_gt_18.loc[df_age_gt_18['Age'].ne(25)]

# .copy() gives independent frames, so adding a column below does not write
# into a slice of df_age_gt_18.
males = df_age_gt_18.loc[df_age_gt_18['Sex'] == 'M'].copy()
females = df_age_gt_18.loc[df_age_gt_18['Sex'] == 'F'].copy()

# Ages are spread horizontally by up to +/-0.4 so that overlapping points stay
# visible in the scatter plot. The regressions below use the un-jittered 'Age'.
# The draws are unseeded, so the jitter differs from run to run.
for sex_frame in (males, females):
    sex_frame['Age_jittered'] = sex_frame['Age'] + np.random.uniform(-0.4, 0.4, len(sex_frame))

# Ordinary least squares with an intercept, one model for males and one for
# females: Weight = const + slope * Age.
y_males = males['Weight']
X_males = sm.add_constant(males[['Age']])
model_males = sm.OLS(y_males, X_males).fit()

y_females = females['Weight']
X_females = sm.add_constant(females[['Age']])
model_females = sm.OLS(y_females, X_females).fit()

# Print the full statsmodels report for each fit.
for heading, fitted in (
    ("Male Regression Summary:", model_males),
    ("\nFemale Regression Summary:", model_females),
):
    print(heading)
    print(fitted.summary())
Male Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.000
Model: OLS Adj. R-squared: -0.001
Method: Least Squares F-statistic: 0.04924
Date: Wed, 28 Jun 2023 Prob (F-statistic): 0.824
Time: 17:09:44 Log-Likelihood: -6082.9
No. Observations: 1664 AIC: 1.217e+04
Df Residuals: 1662 BIC: 1.218e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 35.9300 1.997 17.994 0.000 32.014 39.847
Age -0.0193 0.087 -0.222 0.824 -0.190 0.152
==============================================================================
Omnibus: 29.696 Durbin-Watson: 1.901
Prob(Omnibus): 0.000 Jarque-Bera (JB): 36.696
Skew: 0.242 Prob(JB): 1.08e-08
Kurtosis: 3.543 Cond. No. 200.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Female Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: Weight R-squared: 0.006
Model: OLS Adj. R-squared: 0.006
Method: Least Squares F-statistic: 10.66
Date: Wed, 28 Jun 2023 Prob (F-statistic): 0.00112
Time: 17:09:44 Log-Likelihood: -6139.2
No. Observations: 1664 AIC: 1.228e+04
Df Residuals: 1662 BIC: 1.229e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 30.0529 2.065 14.550 0.000 26.002 34.104
Age 0.2944 0.090 3.265 0.001 0.118 0.471
==============================================================================
Omnibus: 66.022 Durbin-Watson: 1.907
Prob(Omnibus): 0.000 Jarque-Bera (JB): 81.250
Skew: 0.430 Prob(JB): 2.27e-18
Kurtosis: 3.658 Cond. No. 200.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Scatter plot of Weight against jittered Age for the oldest age band,
# overlaid with the fitted OLS line of each sex.
plt.figure(figsize=(4.5, 3.6))
sns.scatterplot(data=males, x='Age_jittered', y='Weight', color='blue', alpha=0.7)
sns.scatterplot(data=females, x='Age_jittered', y='Weight', color='pink', alpha=0.7)

# Regression lines and formulas for males and females.
# Read the coefficients by label ('const' is added by sm.add_constant, 'Age' is
# the regressor column). Integer positions on a label-indexed Series, as in
# params[0] / params[1], are deprecated in pandas.
male_slope, male_intercept = model_males.params['Age'], model_males.params['const']
female_slope, female_intercept = model_females.params['Age'], model_females.params['const']

# Render the intercept with an explicit sign so that a negative value would
# read "... * Age - 1.23" instead of "... * Age + -1.23".
male_label = f"Males: {male_slope:.2f} * Age {'+' if male_intercept >= 0 else '-'} {abs(male_intercept):.2f}"
female_label = f"Females: {female_slope:.2f} * Age {'+' if female_intercept >= 0 else '-'} {abs(female_intercept):.2f}"

sns.lineplot(x=males['Age'], y=model_males.predict(X_males), color='blue', label=male_label)
sns.lineplot(x=females['Age'], y=model_females.predict(X_females), color='red', label=female_label)

plt.title('Regression Analysis (Age 19 + vs. Weight)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()
# Piecewise-linear prediction of Weight from Age, one line segment per age
# band and sex, smoothed with a moving average and plotted.
#
# Ages 0..28 are evaluated although only 0..25 is plotted. This keeps the
# zero-padded right edge of the 'same'-mode convolution (the last two points
# for a window of 5) outside the visible range.
age_range = np.arange(0, 29)

# Age bands. Anything below 5 falls through to the default weight of 0.
in_5_11 = (age_range >= 5) & (age_range <= 11)
in_12_18 = (age_range >= 12) & (age_range <= 18)
over_18 = age_range > 18
age_bands = [in_5_11, in_12_18, over_18]

# Weight calculation based on age range.
# Slopes and intercepts are transcribed by hand from the per-band OLS summaries
# printed above. The 5-11 values are set to the printed fits (males
# 5.1657 / -22.5299, females 4.9545 / -19.1803); the earlier 5.08 / -21.74 and
# 4.71 / -17.56 did not match any fit shown in this notebook.
# NOTE(review): the fits come from a sample (df_sample), so re-transcribe these
# numbers if the sample is redrawn.
weights_male = np.select(
    age_bands,
    [
        5.17 * age_range - 22.53,
        0.57 * age_range + 23.6,
        -0.02 * age_range + 35.9,
    ],
    default=0.0,
)
weights_female = np.select(
    age_bands,
    [
        4.95 * age_range - 19.18,
        0.59 * age_range + 23.7,
        0.29 * age_range + 30,
    ],
    default=0.0,
)

# Moving average for smoothing: a centred, equally weighted window that also
# softens the jumps between the per-band line segments.
window_size = 5
smoothing_kernel = np.ones(window_size) / window_size
weights_male_smoothed = np.convolve(weights_male, smoothing_kernel, mode='same')
weights_female_smoothed = np.convolve(weights_female, smoothing_kernel, mode='same')

# Plot Graph
plt.figure(figsize=(8, 6))
plt.plot(age_range, weights_male_smoothed, label='Males')
plt.plot(age_range, weights_female_smoothed, label='Females')
plt.xlim(0, 25)
plt.title('Weight Prediction by Age (Smoothed)')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()
from scipy import stats

# One-way ANOVA: does mean Weight differ between the three age-band frames?
# Pull the 'Weight' column out of each frame, oldest band first.
weight_gt_18, weight_12_18, weight_5_11 = (
    band_frame['Weight']
    for band_frame in (df_age_gt_18, df_age_12_18, df_age_5_11)
)

# f_oneway returns the F statistic and its p-value.
fvalue, pvalue = stats.f_oneway(weight_gt_18, weight_12_18, weight_5_11)

# Report the test result.
print("ANOVA Results:")
print("F-value:", fvalue)
print("p-value:", pvalue)
ANOVA Results: F-value: 2002.5639636706694 p-value: 0.0
# Post-hoc pairwise comparison of mean Weight between the three age bands.
# One label per observation, repeated to match the size of each group and
# listed in the same order in which the weights are concatenated below.
# NOTE(review): the oldest band is labelled '18+' here but titled "Age 19 +"
# in its regression plot -- confirm which wording is intended.
group_sizes = [len(weight_gt_18), len(weight_12_18), len(weight_5_11)]
group_labels = np.repeat(['18+', '12-18', '5-11'], group_sizes)

# Stack the three weight samples into a single 1-D array.
weight_data = np.concatenate([weight_gt_18, weight_12_18, weight_5_11])

# Tukey's HSD test across every pair of groups, then print the result table.
tukey_results = pairwise_tukeyhsd(weight_data, group_labels)
print(tukey_results)
Multiple Comparison of Means - Tukey HSD, FWER=0.05 ===================================================== group1 group2 meandiff p-adj lower upper reject ----------------------------------------------------- 12-18 18+ 3.7194 0.0 3.0861 4.3527 True 12-18 5-11 -12.7755 0.0 -13.4296 -12.1215 True 18+ 5-11 -16.4949 0.0 -17.1282 -15.8617 True -----------------------------------------------------